In [5]:
#================================================#
# Machine Learning Lecture 2 in Python
# Author: Chong Ma
# Date  : June 26, 2017 
# Topic : Data Visulization in Python
#================================================#
In [6]:
#================================================#
# import Python library (just like library in R)
# most frequently used libraries: @_@ @_@ ...
# numpy, scipy, pandas, matplotlib, sympy etc.
#================================================#
# update jupyter notebook: pip install -U jupyter
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
In [7]:
#=====================================================#
# ^_^           Data Visulization                ^_^  #
#=====================================================#
In [8]:
#=====================================================#
# ^_^                histograms                  ^_^  #
#=====================================================#
##generate normal distributed data
x_seq=np.linspace(-4,4,100)
rndnorm1=np.random.normal(0,1,10000)

# Create a figure instance, and the two subplots
fig=plt.figure(figsize=(12,4))

# the first subplot
ax1=fig.add_subplot(121) # equiv to ax1=fig.add_subplot(1,2,1)
ax1.hist(rndnorm1,bins=100,range=(-4,4),normed=True,
           color=(0.8,0.6,0.4,0.75))
ax1.set_title("Histogram of Normal Distribution")
ax1.set_xlabel("x")
ax1.set_ylabel("Density")
ax1.text(-3,0.35,r'$\mu=0,\ \sigma=1$')
ax1.grid(True)
ax1.plot(x_seq,ss.norm.pdf(x_seq),linewidth=2)

# the second subplot
ax2=fig.add_subplot(122)
# help(sns.distplot)
sns.distplot(rndnorm1,bins=50,kde=True,rug=True,
             kde_kws={"color": "g", "lw": 2, "label": "KDE"},
             color=(0.8,0.6,0.4,0.75), ax=ax2)
ax2.set_title("Histogram of Normal Distribution")
ax2.set_xlabel("x")
ax2.set_ylabel("Density")
ax2.set_xticks(np.arange(-6,6,1))
ax2.set_yticks(np.arange(0,0.5,0.1))
ax2.text(-3,0.35,r'$\mu=0,\ \sigma=1$')
ax2.legend()
# plt.savefig("hist1.pdf")
plt.show()
In [9]:
#=====================================================#
# ^_^       scatter plots(matlab)                ^_^  #
#=====================================================#
from scipy.stats import multivariate_normal
rndnorm2=np.random.normal(0,1,1000).reshape(500,2)
sigma=np.mat([[1,0.5],[0.5,1]])
rndnorm2=np.dot(rndnorm2,sigma)

x,y=np.mgrid[-5:5:0.05,-5:5:0.05]
pos=np.empty(x.shape+(2,))
pos[:,:,0]=x; pos[:,:,1]=y
plt.figure()
plt.scatter(rndnorm2[:,0],rndnorm2[:,1]) 
plt.contour(x,y,multivariate_normal([0,0], sigma).pdf(pos),
            levels=np.arange(0.001,0.4,0.02),
           colors='r',alpha=0.6)
plt.show()
In [10]:
#=====================================================#
# ^_^       scatter plots(Seaborn)               ^_^  #
#=====================================================#
# Create a figure instance, and the two subplots
rndnorm2=pd.DataFrame(rndnorm2)
rndnorm2.columns=["x","y"]

s1=sns.jointplot(x="x",y="y",data=rndnorm2)

# hexbin plot
with sns.axes_style("white"):
    sns.jointplot(x="x", y="y", kind="hex", color="k",data=rndnorm2);
# kernel density plot
sns.jointplot(x="x", y="y", data=rndnorm2, kind="kde");
plt.show()
In [11]:
#=====================================================#
# ^_^       scatter plots(Seaborn)               ^_^  #
#=====================================================#
f, (ax1,ax2) = plt.subplots(ncols=2,figsize=(12, 6))
sns.kdeplot(rndnorm2.x, rndnorm2.y, ax=ax1)
sns.rugplot(rndnorm2.x, color="g", ax=ax1)
sns.rugplot(rndnorm2.y, vertical=True, ax=ax1);

cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
sns.kdeplot(rndnorm2.x, rndnorm2.y, cmap=cmap, n_levels=60, shade=True, ax=ax2)
plt.show()
In [12]:
#=====================================================#
# ^_^                curves                      ^_^  #
#=====================================================#
f1=lambda t: np.exp(-t) * np.sin(2*np.pi*t)

t1 = np.arange(0.0, 5.0, 0.1)
t2 = np.arange(0.0, 5.0, 0.02)

plt.figure(1)
plt.subplot(211)
plt.plot(t1, f1(t1), 'bo', t2, f1(t2), 'k')

plt.subplot(212)
plt.plot(t2, np.cos(2*np.pi*t2), 'r--')
plt.show()
In [ ]:
 
In [13]:
#=====================================================#
# ^_^             boxplots(seaborn)              ^_^  #
#=====================================================#
# generate a simulated data with 5 categories 
x=[]
[x.extend(np.random.normal(10*i,5*(i+1),100)) for i in range(5)]
x=np.array(x)
y=np.repeat(["AA","Delta","VA","JetBlue","Hawaiian"],[100]*5)
In [14]:
# convert to data frame in panda
simdat={"Delay":x, "Airlines":y}
simdat=pd.DataFrame(simdat)
In [15]:
sns.set(style="ticks", palette="muted",color_codes=True)
ax = sns.boxplot(x="Delay",y="Airlines",data=simdat,whis=np.inf, color="c")
plt.show()
In [16]:
sns.violinplot(x="Airlines",y="Delay",data=simdat,palette="Set3")
plt.show()
In [17]:
#=====================================================#
# ^_^                bar plots                   ^_^  #
#=====================================================#
objects = ('Python', 'C++', 'Java', 'Perl', 'R', 'SAS')
y_pos = np.arange(len(objects))
performance = [10,8,6,4,2,1]
colors = ['purple', 'gold', 'maroon', 'olive', 'navy', 'green']

plt.bar(y_pos, performance, align='center', alpha=0.6,
       color=colors)
plt.xticks(y_pos, objects)
plt.ylabel('Usage')
plt.title('Programming language usage') 
plt.show()
In [18]:
#=====================================================#
# ^_^                 pie-chart                  ^_^  #
#=====================================================#

# Data to plot
explode = (0.1, 0, 0, 0, 0, 0)  # explode 1st slice
 
plt.pie(performance, explode=explode, labels=objects, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.show()
In [19]:
xlist = np.linspace(-3.0, 3.0, 3)
ylist = np.linspace(-3.0, 3.0, 4)
X, Y = np.meshgrid(xlist, ylist)
Z = np.sqrt(X**2 + Y**2)

## reset the print style
np.set_printoptions(formatter={'float': '{:0.2f}'.format})
print("X=\n", X, "\n",
      "Y=\n", Y, "\n",
      "Z=\n", Z)
X=
 [[-3.00 0.00 3.00]
 [-3.00 0.00 3.00]
 [-3.00 0.00 3.00]
 [-3.00 0.00 3.00]] 
 Y=
 [[-3.00 -3.00 -3.00]
 [-1.00 -1.00 -1.00]
 [1.00 1.00 1.00]
 [3.00 3.00 3.00]] 
 Z=
 [[4.24 3.00 4.24]
 [3.16 1.00 3.16]
 [3.16 1.00 3.16]
 [4.24 3.00 4.24]]
In [20]:
#=====================================================#
# ^_^              contour plot                  ^_^  #
#=====================================================#
xlist = np.linspace(-3.0, 3.0, 3)
ylist = np.linspace(-3.0, 3.0, 4)
X, Y = np.meshgrid(xlist, ylist)
Z = np.sqrt(X**2 + Y**2)

plt.figure(figsize=(12,4))
plt.subplot(121)
cp1 = plt.contour(X, Y, Z,colors='b')
plt.clabel(cp1, inline=True, 
          fontsize=10)
plt.title('Contour Plot')
plt.xlabel('x (cm)')
plt.ylabel('y (cm)')

plt.subplot(122)
cp2 = plt.contour(X,Y,Z, colors="black", linestyles="dashed")
plt.clabel(cp2, inline=True, fontsize=12)
plt.title("contour plot")
plt.xlabel("x (cm)")
plt.ylabel("y (cm)")
plt.show()
c:\python36-32\lib\site-packages\matplotlib\contour.py:465: RuntimeWarning: invalid value encountered in ceil
  I = [np.floor(I[0]), np.ceil(I[1])]
In [21]:
## Another way to show contour plot
xlist = np.linspace(-3.0, 3.0, 100)
ylist = np.linspace(-3.0, 3.0, 100)
X, Y = np.meshgrid(xlist, ylist)
Z = np.sqrt(X**2 + Y**2)

plt.figure(figsize=(16,12))
plt.subplot(221)
cp3 = plt.contourf(X,Y,Z,cmap='magma')
plt.colorbar(cp3)
plt.title("contour plot")
plt.xlabel("x (cm)")
plt.ylabel("y (cm)")

plt.subplot(222)
cp4 = plt.contourf(X,Y,Z,cmap='Greys',
                  levels=np.arange(0,5,0.25))
plt.colorbar(cp4)
plt.title("contour plot")
plt.xlabel("x (cm)")
plt.ylabel("y (cm)")

plt.subplot(223)
cp5 = plt.contourf(X,Y,Z,cmap='hot',
                  levels=np.linspace(0,5,7))
plt.colorbar(cp5)
plt.title("contour plot")
plt.xlabel("x (cm)")
plt.ylabel("y (cm)")

plt.subplot(224)
cp6 = plt.contourf(X,Y,Z,cmap='inferno')
plt.colorbar(cp6)
plt.title("contour plot")
plt.xlabel("x (cm)")
plt.ylabel("y (cm)")

plt.tight_layout()
# plt.savefig("foo.pdf")
plt.show()
In [22]:
#=====================================================#
# ^_^              heatmap(seaborn)              ^_^  #
#=====================================================#
## a DataFrame with three variables: 
## year, month and passengers
flights = sns.load_dataset("flights") 
flights = flights.pivot("month", "year", "passengers")
flights
Out[22]:
year 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
month
January 112 115 145 171 196 204 242 284 315 340 360 417
February 118 126 150 180 196 188 233 277 301 318 342 391
March 132 141 178 193 236 235 267 317 356 362 406 419
April 129 135 163 181 235 227 269 313 348 348 396 461
May 121 125 172 183 229 234 270 318 355 363 420 472
June 135 149 178 218 243 264 315 374 422 435 472 535
July 148 170 199 230 264 302 364 413 465 491 548 622
August 148 170 199 242 272 293 347 405 467 505 559 606
September 136 158 184 209 237 259 312 355 404 404 463 508
October 119 133 162 191 211 229 274 306 347 359 407 461
November 104 114 146 172 180 203 237 271 305 310 362 390
December 118 140 166 194 201 229 278 306 336 337 405 432
In [23]:
# create a figure instance with four subplots
fig=plt.figure(figsize=(16,12))
ax1=fig.add_subplot(221)
ax2=fig.add_subplot(222)
ax3=fig.add_subplot(223)
ax4=fig.add_subplot(224)

# four heatmap in different versions
sns.heatmap(flights,ax=ax1)
sns.heatmap(flights, annot=True, fmt="d", ax=ax2, cmap="viridis")
sns.heatmap(flights, center=flights.loc["January", 1955], ax=ax3, cmap="plasma")
# generate a random dataset 
data = np.random.randn(50, 20)
sns.heatmap(data, xticklabels=2, yticklabels=False, ax=ax4)
plt.show()
In [24]:
#=====================================================#
# ^_^              Image Process                 ^_^  #
#=====================================================#
from scipy import misc
face = misc.face()
print(face.shape, "\n", face.max,"\n", face.dtype)
(768, 1024, 3) 
 <built-in method max of numpy.ndarray object at 0x13629D18> 
 uint8
In [25]:
## define a tint function that increase the lightness of
## the image
def tint(imag, percent): 
    if np.any(imag>1): 
        imag=imag/255 
    return imag+(np.ones(imag.shape)-imag)*percent
# tint=lambda imag, percent: imag+(np.ones(imag.shape)-imag)*percent

plt.figure()
plt.subplot(221)
plt.axis("off")
plt.imshow(face)

plt.subplot(222)
plt.axis("off")
plt.imshow(tint(imag=face,percent=0.4))

plt.subplot(223)
plt.axis("off")
plt.imshow(tint(imag=face,percent=0.6))

plt.subplot(224)
plt.axis("off")
plt.imshow(tint(imag=face,percent=0.8))

plt.show()